1 Intersection volume

1.1 Example

# Read one raw traffic snapshot (all intersections, recorded 2018-09-06 06:26)
# and flatten the nested JSON structure into plain columns.
example <- jsonlite::fromJSON(
  here("data", "BCC", "traffic", "raw", "temp_02",
       "traffic-data-at-int-201809060626.json"),
  flatten = TRUE
)

1.1.1 Structure

# Tabulate the skim() summary for rendering (nested call instead of a pipe).
skimr::kable(skim(example))

Skim summary statistics
n obs: 2480
n variables: 20

Variable type: character
variable missing complete n min max empty n_unique
lane 0 2480 2480 4 6 0 705
married 0 2480 2480 1 1 0 2
recorded 0 2480 2480 19 19 0 2
Variable type: integer
variable missing complete n mean sd p0 p25 p50 p75 p100 hist
ct 0 2480 2480 80.44 28.29 20 60 80 96.5 150
dbid 0 2480 2480 1.3e+09 154.54 1.3e+09 1.3e+09 1.3e+09 1.3e+09 1.3e+09
ds1 21 2459 2480 18.85 24.39 0 0 9 31 146
ds2 1317 1163 2480 27.07 26.04 0 0 23 41 167
ds3 2298 182 2480 32.43 29.05 0 10.25 25 46.75 114
ds4 2473 7 2480 21.29 36 0 0 0 31 87
link_plan 0 2480 2480 1.35 0.78 0 1 1 2 3
mf1 21 2459 2480 3.76 6.09 0 0 2 5 52
mf2 1317 1163 2480 6.98 8.6 0 0 4 10 58
mf3 2298 182 2480 11.15 12.06 0 3 8 15 60
mf4 2473 7 2480 7.14 16.32 0 0 0 3 44
rf1 21 2459 2480 3.49 6.04 0 0 1 4 53
rf2 1317 1163 2480 6.59 8.53 0 0 4 9 55
rf3 2298 182 2480 10.79 12.01 0 2 7 14 59
rf4 2473 7 2480 6.71 15.23 0 0 0 3 41
ss 0 2480 2480 2482.38 271.89 2065 2228 2443.5 2775 3047
tsc 0 2480 2480 1295.85 2172.02 9 484 650.5 865 9011

1.1.2 Missing data

gg_miss_var(example, show_pct = TRUE)

1.2 Larger example

# Load every JSON snapshot in the folder and stack them row-wise,
# keeping the originating file path in the `source` column.
json_files <- fs::dir_ls(
  here("data", "BCC", "traffic", "raw", "temp_02"),
  regexp = "\\.json$"
)
example <- map_df(json_files, jsonlite::fromJSON, flatten = TRUE, .id = "source")

saveRDS(example,  here("data", "BCC", "traffic", "clean", "example.Rds"))

1.2.1 Time

# Drop the bookkeeping column and parse the timestamp strings into POSIXct.
# Explicit assignment instead of the magrittr compound pipe (%<>%).
example <- example %>%
  select(-source) %>%
  mutate(recorded = anytime(recorded))

summary(example$recorded)
##                  Min.               1st Qu.                Median 
## "2018-09-06 00:00:00" "2018-09-06 11:02:00" "2018-09-06 23:42:00" 
##                  Mean               3rd Qu.                  Max. 
## "2018-09-06 23:15:07" "2018-09-07 10:46:00" "2018-09-08 00:00:00"

TODO: identify the gaps

TODO: find largest period without data

TODO: fill in the gaps

1.2.2 Duplicated rows

There are 16113 duplicated rows in the raw data — all occurrences were removed.

# isUnique(example$dbid)

# dbid is not unique, so drop fully duplicated rows instead.
example <- distinct(example)

1.2.3 Temporal pattern

1.2.3.1 One intersection

Example from one intersection in St Lucia (tsc == 8074).

Measured flow (mf) aggregated for all lanes.

# Aggregate detector readings for one intersection (tsc == 8074, St Lucia).
# Each lane reports up to four detectors (ds1-ds4, mf1-mf4, rf1-rf4):
# sum them per row, then total across lanes for each timestamp.
# matches("^ds\\d+$") pins the exact detector columns — contains("ds")
# would also pick up any column whose name merely contains the substring
# (including the derived `ds` column on a re-run of this chunk).
traffic_8074 <- example %>% 
  filter(tsc == 8074) %>% 
  mutate(ds = rowSums(select(., matches("^ds\\d+$")), na.rm = TRUE)) %>% 
  mutate(mf = rowSums(select(., matches("^mf\\d+$")), na.rm = TRUE)) %>% 
  mutate(rf = rowSums(select(., matches("^rf\\d+$")), na.rm = TRUE)) %>% 
  group_by(recorded, tsc) %>% 
  summarise(ds = sum(ds),
            mf = sum(mf),
            rf = sum(rf)) %>% 
  ungroup()  # drop residual grouping, consistent with the all-intersections chunk

1.2.3.2 All intersections

All intersections (i.e. those with lat/long coordinates) across 2 days:

# City-wide aggregation: sum the detector columns per row, then total per
# intersection (tsc) and timestamp. matches("^ds\\d+$") selects exactly
# ds1-ds4 etc.; contains("ds") would also match any other column whose
# name contains the substring (e.g. a derived `ds` column on re-run).
traffic_agg <- example %>% 
  mutate(ds = rowSums(select(., matches("^ds\\d+$")), na.rm = TRUE)) %>% 
  mutate(mf = rowSums(select(., matches("^mf\\d+$")), na.rm = TRUE)) %>% 
  mutate(rf = rowSums(select(., matches("^rf\\d+$")), na.rm = TRUE)) %>% 
  group_by(recorded, tsc) %>% 
  summarise(ds = sum(ds),
            mf = sum(mf),
            rf = sum(rf)) %>% 
  ungroup()

# Attach intersection coordinates, then drop intersections without them.
# NOTE(review): `intersect` is presumably a metadata data frame loaded
# elsewhere — it shadows base::intersect and is worth renaming.
# The join key is made explicit (by = "tsc") so the join no longer relies
# on natural-join column guessing (which also emits a console message).
traffic_agg <- left_join(traffic_agg, 
                         select(intersect, tsc, coordinates.latLng.latitude, coordinates.latLng.longitude),
                         by = "tsc") %>% 
  filter(!is.na(coordinates.latLng.longitude) & !is.na(coordinates.latLng.latitude))
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

1.2.3.3 Hourly aggregates

1.2.3.4 Daily aggregates

1.2.4 Spatial pattern

1.2.4.1 One time period

Traffic aggregates across city at 8 AM:

# Snapshot map: measured flow (mf) at every located intersection at 8 AM.
traffic_agg %>% 
  filter(recorded == "2018-09-06 08:00:00") %>% 
  ggplot() +
  geom_point(
    aes(x = coordinates.latLng.longitude,
        y = coordinates.latLng.latitude,
        size = mf),
    colour = "darkgreen", alpha = 0.5, shape = 20, stroke = FALSE
  ) +
  scale_size_continuous(name = "Measured flow", range = c(1, 12)) +
  theme_void() +
  coord_map()

1.2.4.2 Full time period

Traffic aggregates across city for one day:

library(gganimate)

# Animate the intersection map over the full recording period,
# one frame per timestamp.
traffic_agg %>% 
  ggplot() +
  geom_point(
    aes(x = coordinates.latLng.longitude,
        y = coordinates.latLng.latitude,
        size = mf),
    colour = "darkgreen", alpha = 0.5, shape = 20, stroke = FALSE
  ) +
  scale_size_continuous(name = "Measured flow", range = c(1, 12)) +
  theme_void() +
  coord_map() +
  labs(title = 'Time: {frame_time}') +
  transition_time(recorded)

1.3 Full data

There are 28569 files collected so far — better processing or HPC may be needed to load them efficiently.

1.3.1 File size issues

In a dataset collected so far we can find some small files:

##  
## Variable ¦      Obs  Missing     Mean   StdDev      Min      Max 
## ---------+-------------------------------------------------------
##     size ¦    28569        0   672963   168293    50604  1.2e+07

Using some arbitrary threshold we can get rid of the small files:

table(file.info(files)$size < 17000)
## 
## FALSE 
## 28569

**TODO: They can be either manually deleted, selected in R, or processed in bash.**

Bash code?

# Dry run: list JSON files smaller than 17 kB without touching them.
sudo find . -name "*.json" -size -17k
# Destructive: same match with -delete. Run the dry run above first.
sudo find . -name "*.json" -size -17k -delete

2 Session info

sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 15063)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_Australia.1252  LC_CTYPE=English_Australia.1252   
## [3] LC_MONETARY=English_Australia.1252 LC_NUMERIC=C                      
## [5] LC_TIME=English_Australia.1252    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] bindrcpp_0.2.2   tmaptools_2.0-1  tmap_2.1-1       sf_0.6-3        
##  [5] kableExtra_0.9.0 janitor_1.1.1    naniar_0.4.0.0   statar_0.6.5    
##  [9] skimr_1.0.3      fs_1.2.6         jsonlite_1.5     anytime_0.3.1   
## [13] magrittr_1.5     readxl_1.1.0     here_0.1         forcats_0.3.0   
## [17] stringr_1.3.1    dplyr_0.7.6      purrr_0.2.5      readr_1.1.1     
## [21] tidyr_0.8.1      tibble_1.4.2     ggplot2_3.0.0    tidyverse_1.2.1 
## [25] pacman_0.4.6    
## 
## loaded via a namespace (and not attached):
##  [1] nlme_3.1-137       matrixStats_0.54.0 satellite_1.0.1   
##  [4] lubridate_1.7.4    webshot_0.5.1      RColorBrewer_1.1-2
##  [7] httr_1.3.1         rprojroot_1.3-2    mapview_2.6.0     
## [10] tools_3.5.1        backports_1.1.2    rgdal_1.3-4       
## [13] R6_2.3.0           KernSmooth_2.23-15 mgcv_1.8-24       
## [16] rgeos_0.3-28       spData_0.2.9.4     DBI_1.0.0         
## [19] lazyeval_0.2.1     colorspace_1.3-2   raster_2.6-7      
## [22] withr_2.1.2        sp_1.3-1           tidyselect_0.2.5  
## [25] leaflet_2.0.2      compiler_3.5.1     cli_1.0.1         
## [28] rvest_0.3.2        xml2_1.2.0         labeling_0.3      
## [31] scales_1.0.0       classInt_0.2-3     RApiDatetime_0.0.3
## [34] digest_0.6.18      rmarkdown_1.10     base64enc_0.1-3   
## [37] dichromat_2.0-0    pkgconfig_2.0.2    htmltools_0.3.6   
## [40] maps_3.3.0         highr_0.7          htmlwidgets_1.3   
## [43] rlang_0.2.2        rstudioapi_0.8     shiny_1.1.0       
## [46] bindr_0.1.1        crosstalk_1.0.0    Matrix_1.2-14     
## [49] Rcpp_0.12.19       munsell_0.5.0      visdat_0.5.1      
## [52] stringi_1.2.4      yaml_2.2.0         plyr_1.8.4        
## [55] grid_3.5.1         parallel_3.5.1     promises_1.0.1    
## [58] crayon_1.3.4       lattice_0.20-35    haven_1.1.2       
## [61] mapproj_1.2.6      hms_0.4.2          knitr_1.20        
## [64] pillar_1.3.0       stats4_3.5.1       XML_3.98-1.16     
## [67] glue_1.3.0         evaluate_0.12      data.table_1.11.8 
## [70] modelr_0.1.2       png_0.1-7          httpuv_1.4.5      
## [73] cellranger_1.1.0   gtable_0.2.0       assertthat_0.2.0  
## [76] mime_0.6           lwgeom_0.1-4       xtable_1.8-3      
## [79] broom_0.5.0        e1071_1.7-0        later_0.7.5       
## [82] class_7.3-14       viridisLite_0.3.0  units_0.6-1